# Load libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ✔ readr     2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
# ============================================================================
# Load data
# Expects 'project data.csv' in the working directory with columns
# Group, M.F, Age, EDUC, SES, MMSE, CDR, eTIV, nWBV, ASF (373 rows,
# see str() output below). Looks like an OASIS-style dementia dataset.
Alzheimer <- read.csv('project data.csv')
# Sanity-check the import: first rows
head(Alzheimer)
##         Group M.F Age EDUC SES MMSE CDR eTIV  nWBV   ASF
## 1 Nondemented   M  87   14   2   27 0.0 1987 0.696 0.883
## 2 Nondemented   M  88   14   2   30 0.0 2004 0.681 0.876
## 3    Demented   M  75   12  NA   23 0.5 1678 0.736 1.046
## 4    Demented   M  76   12  NA   28 0.5 1738 0.713 1.010
## 5    Demented   M  80   12  NA   22 0.5 1698 0.701 1.034
## 6 Nondemented   F  88   18   3   28 0.0 1215 0.710 1.444
# Column types: Group and M.F arrive as character; SES contains NAs
str(Alzheimer)
## 'data.frame':    373 obs. of  10 variables:
##  $ Group: chr  "Nondemented" "Nondemented" "Demented" "Demented" ...
##  $ M.F  : chr  "M" "M" "M" "M" ...
##  $ Age  : int  87 88 75 76 80 88 90 80 83 85 ...
##  $ EDUC : int  14 14 12 12 12 18 18 12 12 12 ...
##  $ SES  : int  2 2 NA NA NA 3 3 4 4 4 ...
##  $ MMSE : int  27 30 23 28 22 28 27 28 29 30 ...
##  $ CDR  : num  0 0 0.5 0.5 0.5 0 0 0 0.5 0 ...
##  $ eTIV : int  1987 2004 1678 1738 1698 1215 1200 1689 1701 1699 ...
##  $ nWBV : num  0.696 0.681 0.736 0.713 0.701 0.71 0.718 0.712 0.711 0.705 ...
##  $ ASF  : num  0.883 0.876 1.046 1.01 1.034 ...
# Preliminary Analysis
# Convert M/F into numeric values: 'M' -> 1, 'F' -> 0, anything else -> NA
Alzheimer$M.F <- ifelse(Alzheimer$M.F == 'M', 1, 
                        ifelse(Alzheimer$M.F == 'F', 0, NA))

# Confirm the conversion
head(Alzheimer)
##         Group M.F Age EDUC SES MMSE CDR eTIV  nWBV   ASF
## 1 Nondemented   1  87   14   2   27 0.0 1987 0.696 0.883
## 2 Nondemented   1  88   14   2   30 0.0 2004 0.681 0.876
## 3    Demented   1  75   12  NA   23 0.5 1678 0.736 1.046
## 4    Demented   1  76   12  NA   28 0.5 1738 0.713 1.010
## 5    Demented   1  80   12  NA   22 0.5 1698 0.701 1.034
## 6 Nondemented   0  88   18   3   28 0.0 1215 0.710 1.444
# Remove rows with Group = 'Converted' (keep only Demented/Nondemented).
# FIX: `filter` is masked by plotly (see the conflict messages at the top of
# the file), so call dplyr's verb with an explicit namespace rather than
# relying on search-path order.
Alzheimer <- Alzheimer %>%
  dplyr::filter(Group != 'Converted')

# Drop rows with any missing values (SES and MMSE contain NAs)
Alzheimer <- na.omit(Alzheimer)

# Analysis
# Generate summary of Alzheimer
# Five-number summaries + means per column. Group is still character here
# (Length/Class/Mode rows); 317 rows remain after dropping 'Converted'
# subjects and NA rows.
summary(Alzheimer)
##     Group                M.F              Age             EDUC      
##  Length:317         Min.   :0.0000   Min.   :60.00   Min.   : 6.00  
##  Class :character   1st Qu.:0.0000   1st Qu.:71.00   1st Qu.:12.00  
##  Mode  :character   Median :0.0000   Median :76.00   Median :15.00  
##                     Mean   :0.4322   Mean   :76.72   Mean   :14.62  
##                     3rd Qu.:1.0000   3rd Qu.:82.00   3rd Qu.:16.00  
##                     Max.   :1.0000   Max.   :98.00   Max.   :23.00  
##       SES             MMSE            CDR              eTIV     
##  Min.   :1.000   Min.   : 4.00   Min.   :0.0000   Min.   :1106  
##  1st Qu.:2.000   1st Qu.:27.00   1st Qu.:0.0000   1st Qu.:1358  
##  Median :2.000   Median :29.00   Median :0.0000   Median :1476  
##  Mean   :2.546   Mean   :27.26   Mean   :0.2729   Mean   :1494  
##  3rd Qu.:3.000   3rd Qu.:30.00   3rd Qu.:0.5000   3rd Qu.:1599  
##  Max.   :5.000   Max.   :30.00   Max.   :2.0000   Max.   :2004  
##       nWBV             ASF       
##  Min.   :0.6440   Min.   :0.876  
##  1st Qu.:0.7000   1st Qu.:1.098  
##  Median :0.7320   Median :1.189  
##  Mean   :0.7306   Mean   :1.192  
##  3rd Qu.:0.7570   3rd Qu.:1.293  
##  Max.   :0.8370   Max.   :1.587
# ========================================================================
# Select all numeric variables
# NOTE(review): attach() is discouraged (it snapshots columns onto the
# search path, which can mask objects and go stale). It is kept here only
# because later code resolves bare column names through it; prefer
# data-frame references in new code.
attach(Alzheimer)
numeric_vars <- c('Age', 'EDUC', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF')
numeric_vars
## [1] "Age"  "EDUC" "SES"  "MMSE" "CDR"  "eTIV" "nWBV" "ASF"
# Find standard deviation of each variable.
# FIX: vapply() iterates over the data.frame's columns directly and
# guarantees a numeric result per column, avoiding apply()'s implicit
# coercion of the whole data.frame to a matrix. The result is the same
# named numeric vector as before.
sds <- vapply(Alzheimer[, numeric_vars], sd, numeric(1))
print(sds)
##          Age         EDUC          SES         MMSE          CDR         eTIV 
##   7.80507137   2.92687640   1.12309861   3.86122732   0.38214372 179.71907893 
##         nWBV          ASF 
##   0.03810197   0.13966275
# Create appropriate plots
# Boxplot of Age by diagnosis group, split by gender. M.F is still coded
# 0/1 at this point, so it is mapped to a factor for the fill aesthetic
# and relabelled in the legend.
# FIX: dropped the redundant paste() around a single string literal.
ggplot(Alzheimer,
       aes(x = Group, y = Age, fill = as.factor(M.F))) +
  geom_boxplot() +
  labs(x = 'Group', y = 'Age', 
       title = 'Boxplot of Demented and Nondemented based on Age and Gender', 
       fill = 'Gender (M.F)') +
  scale_fill_manual(
    values = c("0" = "tomato1", "1" = "lightseagreen" ),
    labels = c("0" = "Female", "1" = "Male"))

# Convert M.F (0/1) to a labelled factor for plotting.
# FIX: the original used the bare name `M.F`, which resolved through the
# earlier attach(); reference the column explicitly so this line does not
# depend on the search path.
Alzheimer$M.F <- as.factor(ifelse(Alzheimer$M.F == 1, 'Male', 'Female'))

# Dodged bar chart of Group counts per gender, with count labels on bars
gender_G <- ggplot(Alzheimer,
                   aes(x = M.F,
                       fill = Group)) +
  geom_bar(position = 'dodge', color = 'black') +
  # FIX: after_stat(count) replaces `..count..`, which was deprecated in
  # ggplot2 3.4.0 (the lifecycle warning in the original rendered output
  # came from this line and no longer occurs).
  geom_text(aes(label = after_stat(count)), stat = 'count', vjust = 0.5, colour = 'black') +
  labs(x = 'Gender', y = 'Frequency',
       title = 'Barchart of Gender by Demented vs Nondemented') 

# Interactive version of the bar chart
ggplotly(gender_G)
# ========================================================================
# Convert Group variable to numeric values: Demented -> 1, Nondemented -> 0
Alzheimer$Group <- ifelse(Alzheimer$Group == 'Demented', 1,
                          ifelse(Alzheimer$Group == 'Nondemented', 0, NA))

# Convert M/F into numeric values (undoing the factor labelling used for
# the bar chart above): Male -> 1, Female -> 0
Alzheimer$M.F <- ifelse(Alzheimer$M.F == 'Male', 1, 
                        ifelse(Alzheimer$M.F == 'Female', 0, NA))
head(Alzheimer)
##   Group M.F Age EDUC SES MMSE CDR eTIV  nWBV   ASF
## 1     0   1  87   14   2   27 0.0 1987 0.696 0.883
## 2     0   1  88   14   2   30 0.0 2004 0.681 0.876
## 6     0   0  88   18   3   28 0.0 1215 0.710 1.444
## 7     0   0  90   18   3   27 0.0 1200 0.718 1.462
## 8     0   1  80   12   4   28 0.0 1689 0.712 1.039
## 9     0   1  83   12   4   29 0.5 1701 0.711 1.032
# Similarity measure
# Pairwise Euclidean distances between subjects.
# NOTE(review): this runs on the raw, unstandardized data and includes the
# Group/M.F codings as coordinates, so large-scale columns (eTIV) dominate
# the Euclidean distances — presumably intentional as a contrast with the
# standardized Pearson version below; confirm.
distance.Euclidean <- get_dist(Alzheimer)
fviz_dist(distance.Euclidean, 
          gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

# Correlation-based (Pearson) distance on standardized columns
distance.corr <- get_dist(Alzheimer, stand = TRUE, method = "pearson")
fviz_dist(distance.corr, 
          gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

# Standardize features: center each column to mean 0 and scale to unit sd
# before distance-based clustering
scaled_A_vars <- scale(Alzheimer)

# Determine a reasonable number of clusters via the elbow (total
# within-cluster SS) method; the dashed line marks the chosen k = 3
fviz_nbclust(scaled_A_vars, kmeans, method = "wss") +
  geom_vline(xintercept = 3, linetype = 2)

# K-Means Clustering for k = 2, 3 and 4, each with 20 random restarts.
# The seed is set once before the three fits (in this order) so the
# published output below is reproduced exactly.
set.seed(123)
run_kmeans <- function(n_centers) {
  kmeans(scaled_A_vars, centers = n_centers, nstart = 20)
}
kmeans2 <- run_kmeans(2)
kmeans3 <- run_kmeans(3)
kmeans4 <- run_kmeans(4)
kmeans3
## K-means clustering with 3 clusters of sizes 102, 138, 77
## 
## Cluster means:
##        Group        M.F         Age       EDUC          SES       MMSE
## 1  1.2212058  0.2552531 -0.01010028 -0.5384328  0.430657146 -0.9691794
## 2 -0.7276933 -0.6811764 -0.17716110 -0.0244847  0.004437111  0.5027078
## 3 -0.3135235  0.8826822  0.33088909  0.7571303 -0.578433119  0.3828912
##          CDR       eTIV       nWBV        ASF
## 1  1.1331121 -0.2233269 -0.5233154  0.1817558
## 2 -0.6571651 -0.5477012  0.5301795  0.5347985
## 3 -0.3232292  1.2774301 -0.2569688 -1.1992374
## 
## Clustering vector:
##   1   2   6   7   8   9  10  14  15  16  17  18  19  20  21  22  23  24  25  26 
##   3   3   2   2   3   3   3   2   2   1   1   2   1   2   2   2   2   2   2   1 
##  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46 
##   1   1   1   3   3   3   3   3   3   2   2   1   1   1   1   2   2   2   2   1 
##  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66 
##   1   2   2   2   2   1   1   2   2   2   2   2   2   2   2   2   2   1   1   1 
##  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86 
##   1   2   1   1   1   1   2   2   1   1   1   1   1   2   2   1   1   2   2   1 
##  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 
##   1   1   1   1   2   2   2   1   1   3   3   3   3   3   2   2   2   2   2   2 
## 107 108 109 110 111 112 113 114 115 116 117 118 119 120 123 124 125 126 127 128 
##   2   2   2   1   1   1   1   1   3   3   3   3   3   3   1   1   1   3   3   2 
## 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 
##   2   2   2   2   2   2   2   3   3   3   3   3   1   1   2   2   2   2   2   1 
## 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 
##   1   2   2   2   3   3   3   3   3   1   1   1   1   1   1   1   1   2   2   2 
## 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 
##   2   1   1   1   1   1   1   3   3   3   2   2   2   2   3   3   3   2   2   3 
## 189 190 191 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 
##   3   1   1   2   2   2   2   2   2   1   1   1   1   1   3   3   1   1   3   3 
## 211 212 213 214 215 216 217 218 221 222 223 224 225 226 227 228 229 230 231 232 
##   3   3   1   1   1   1   1   1   1   1   3   3   3   3   2   2   2   1   1   2 
## 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 
##   2   2   2   1   1   2   2   2   2   2   2   2   2   1   1   3   3   3   3   2 
## 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 
##   2   2   2   1   1   1   2   2   2   2   3   3   3   1   1   2   2   2   2   2 
## 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 291 292 293 294 
##   2   1   1   2   2   2   2   3   2   2   1   1   2   2   2   2   3   3   3   1 
## 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 
##   1   3   1   1   1   2   2   3   3   3   3   3   2   2   2   3   3   3   2   2 
## 315 316 317 318 319 325 326 327 328 329 330 331 332 333 334 335 336 
##   2   2   2   3   3   2   2   2   2   1   1   3   3   3   2   2   2 
## 
## Within cluster sum of squares by cluster:
## [1] 792.3855 693.8160 441.0180
##  (between_SS / total_SS =  39.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# To visualise the results the fviz_cluster function can be used.
# The data is already standardized, hence stand = FALSE.
print(fviz_cluster(kmeans2, data = scaled_A_vars, stand = FALSE))
print(fviz_cluster(kmeans3, data = scaled_A_vars, stand = FALSE))
print(fviz_cluster(kmeans4, data = scaled_A_vars, stand = FALSE))

# Point-only versions of the three solutions, arranged on a single page
p_k2 <- fviz_cluster(kmeans2, geom = "point", data = scaled_A_vars) +
  ggtitle("k = 2")
p_k3 <- fviz_cluster(kmeans3, geom = "point", data = scaled_A_vars) +
  ggtitle("k = 3")
p_k4 <- fviz_cluster(kmeans4, geom = "point", data = scaled_A_vars) +
  ggtitle("k = 4")
grid.arrange(p_k2, p_k3, p_k4, nrow = 2)

# ========================================================================
# Implement feature selection on the data set 
# NOTE(review): this is a second attach() of the same data frame (hence the
# masking message below). The bare-name lm() calls further down depend on
# it; prefer explicit data= arguments in new code.
attach(Alzheimer)
## The following objects are masked from Alzheimer (pos = 3):
## 
##     Age, ASF, CDR, EDUC, eTIV, Group, M.F, MMSE, nWBV, SES
# Response: column 1 is Group (0 = Nondemented, 1 = Demented)
y_Group <- as.numeric(Alzheimer[,1])
# Predictors: remaining columns (M.F, Age, EDUC, SES, MMSE, CDR, eTIV, nWBV, ASF)
X <- Alzheimer[,2:10]

# Full model with all predictors. glm() with the default gaussian family
# is an ordinary least-squares fit on the 0/1 response (a linear
# probability model, equivalent to lm), not logistic regression —
# presumably intentional for AIC-based stepwise selection; confirm.
model1 <- glm(y_Group~.,data=X)
summary(model1)
## 
## Call:
## glm(formula = y_Group ~ ., data = X)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.283e-01  1.560e+00   0.339   0.7351    
## M.F          1.602e-01  3.447e-02   4.649 4.95e-06 ***
## Age         -3.523e-03  2.135e-03  -1.650   0.0999 .  
## EDUC        -1.086e-02  6.920e-03  -1.570   0.1175    
## SES          1.039e-02  1.800e-02   0.577   0.5642    
## MMSE         5.481e-03  5.274e-03   1.039   0.2995    
## CDR          1.056e+00  5.234e-02  20.169  < 2e-16 ***
## eTIV        -2.187e-05  5.144e-04  -0.043   0.9661    
## nWBV        -9.447e-01  4.803e-01  -1.967   0.0501 .  
## ASF          4.122e-01  6.520e-01   0.632   0.5277    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.05730189)
## 
##     Null deviance: 76.120  on 316  degrees of freedom
## Residual deviance: 17.592  on 307  degrees of freedom
## AIC: 5.0092
## 
## Number of Fisher Scoring iterations: 2
# Backward stepwise selection by AIC.
# FIX: step() has no `method` argument — the original `method = "backward"`
# was silently absorbed by `...`. With no `scope` supplied, step() defaults
# to backward elimination anyway, so the recorded trace below is unchanged;
# the call now states the intent explicitly.
step1 <- step(model1, direction = "backward")
## Start:  AIC=5.01
## y_Group ~ M.F + Age + EDUC + SES + MMSE + CDR + eTIV + nWBV + 
##     ASF
## 
##        Df Deviance     AIC
## - eTIV  1   17.592   3.011
## - SES   1   17.611   3.353
## - ASF   1   17.615   3.422
## - MMSE  1   17.654   4.122
## <none>      17.592   5.009
## - EDUC  1   17.733   5.544
## - Age   1   17.748   5.808
## - nWBV  1   17.813   6.980
## - M.F   1   18.830  24.578
## - CDR   1   40.901 270.474
## 
## Step:  AIC=3.01
## y_Group ~ M.F + Age + EDUC + SES + MMSE + CDR + nWBV + ASF
## 
##        Df Deviance     AIC
## - SES   1   17.611   1.354
## - MMSE  1   17.654   2.124
## <none>      17.592   3.011
## - EDUC  1   17.736   3.595
## - Age   1   17.751   3.866
## - nWBV  1   17.815   5.000
## - ASF   1   18.326  13.973
## - M.F   1   18.866  23.185
## - CDR   1   40.946 268.816
## 
## Step:  AIC=1.35
## y_Group ~ M.F + Age + EDUC + MMSE + CDR + nWBV + ASF
## 
##        Df Deviance     AIC
## - MMSE  1   17.670   0.414
## <none>      17.611   1.354
## - Age   1   17.770   2.212
## - nWBV  1   17.827   3.223
## - EDUC  1   18.058   7.311
## - ASF   1   18.410  13.431
## - M.F   1   18.938  22.381
## - CDR   1   40.951 266.861
## 
## Step:  AIC=0.41
## y_Group ~ M.F + Age + EDUC + CDR + nWBV + ASF
## 
##        Df Deviance    AIC
## <none>      17.670   0.41
## - Age   1   17.802   0.78
## - nWBV  1   17.845   1.55
## - EDUC  1   18.084   5.76
## - ASF   1   18.460  12.27
## - M.F   1   18.995  21.34
## - CDR   1   55.051 358.65
# Inspect the backward-selected model
summary(step1)
## 
## Call:
## glm(formula = y_Group ~ M.F + Age + EDUC + CDR + nWBV + ASF, 
##     data = X)
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.546490   0.484185   1.129 0.259906    
## M.F          0.162088   0.033617   4.822 2.24e-06 ***
## Age         -0.003178   0.002085  -1.524 0.128482    
## EDUC        -0.013056   0.004843  -2.696 0.007405 ** 
## CDR          1.020255   0.039840  25.609  < 2e-16 ***
## nWBV        -0.813532   0.463650  -1.755 0.080311 .  
## ASF          0.448722   0.120551   3.722 0.000234 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.05699945)
## 
##     Null deviance: 76.12  on 316  degrees of freedom
## Residual deviance: 17.67  on 310  degrees of freedom
## AIC: 0.41434
## 
## Number of Fisher Scoring iterations: 2
# y_Group ~ M.F + Age + EDUC + CDR + nWBV + ASF (Features selected)

# Forward stepwise selection starting from the intercept-only model.
# FIX: step() takes `direction`, not `method` — the original
# `method = "forward"` was silently ignored, so the run actually used the
# default direction "both" (which is why the recorded trace below also
# considers `-` drops). Nothing was ever dropped, so forward selection
# reaches the same final model; a re-run trace will show only `+` rows.
model2 <- lm(y_Group~1,data=X)
step2 <- step(model2, 
              scope=~ M.F + Age + EDUC + SES + MMSE + CDR+ eTIV + nWBV + ASF,
              direction = "forward")
## Start:  AIC=-450.23
## y_Group ~ 1
## 
##        Df Sum of Sq    RSS     AIC
## + CDR   1    56.023 20.097 -870.39
## + MMSE  1    29.571 46.549 -604.13
## + nWBV  1     8.354 67.765 -485.08
## + M.F   1     5.730 70.389 -473.04
## + EDUC  1     3.703 72.417 -464.04
## + SES   1     2.065 74.055 -456.95
## <none>              76.120 -450.23
## + Age   1     0.219 75.901 -449.14
## + eTIV  1     0.013 76.107 -448.28
## + ASF   1     0.002 76.118 -448.24
## 
## Step:  AIC=-870.39
## y_Group ~ CDR
## 
##        Df Sum of Sq    RSS     AIC
## + M.F   1     0.700 19.398 -879.62
## + EDUC  1     0.668 19.429 -879.10
## + SES   1     0.623 19.474 -878.37
## + ASF   1     0.189 19.908 -871.38
## + eTIV  1     0.176 19.921 -871.17
## <none>              20.097 -870.39
## + nWBV  1     0.062 20.036 -869.36
## + Age   1     0.049 20.048 -869.17
## + MMSE  1     0.000 20.097 -868.39
## - CDR   1    56.023 76.120 -450.23
## 
## Step:  AIC=-879.62
## y_Group ~ CDR + M.F
## 
##        Df Sum of Sq    RSS     AIC
## + eTIV  1     1.157 18.240 -897.12
## + ASF   1     1.144 18.253 -896.89
## + EDUC  1     0.778 18.619 -890.60
## + SES   1     0.687 18.710 -889.06
## <none>              19.398 -879.62
## + Age   1     0.033 19.364 -878.16
## + nWBV  1     0.015 19.383 -877.86
## + MMSE  1     0.001 19.397 -877.63
## - M.F   1     0.700 20.097 -870.39
## - CDR   1    50.992 70.389 -473.04
## 
## Step:  AIC=-897.12
## y_Group ~ CDR + M.F + eTIV
## 
##        Df Sum of Sq    RSS     AIC
## + EDUC  1     0.359 17.881 -901.43
## + SES   1     0.257 17.983 -899.62
## <none>              18.240 -897.12
## + nWBV  1     0.071 18.170 -896.35
## + Age   1     0.009 18.231 -895.28
## + MMSE  1     0.004 18.236 -895.20
## + ASF   1     0.004 18.236 -895.19
## - eTIV  1     1.157 19.398 -879.62
## - M.F   1     1.681 19.921 -871.17
## - CDR   1    49.100 67.340 -485.08
## 
## Step:  AIC=-901.43
## y_Group ~ CDR + M.F + eTIV + EDUC
## 
##        Df Sum of Sq    RSS     AIC
## <none>              17.881 -901.43
## + nWBV  1     0.068 17.813 -900.63
## + Age   1     0.020 17.861 -899.78
## + MMSE  1     0.017 17.864 -899.74
## + SES   1     0.014 17.867 -899.68
## + ASF   1     0.013 17.868 -899.66
## - EDUC  1     0.359 18.240 -897.12
## - eTIV  1     0.738 18.619 -890.60
## - M.F   1     1.508 19.389 -877.76
## - CDR   1    46.972 64.853 -495.01
# Inspect the forward-selected model: CDR, M.F, eTIV, EDUC
summary(step2)
## 
## Call:
## lm(formula = y_Group ~ CDR + M.F + eTIV + EDUC, data = X)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1731 -0.1229 -0.0542  0.2064  0.4758 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.257e-01  1.340e-01   5.417 1.21e-07 ***
## CDR          1.047e+00  3.657e-02  28.629  < 2e-16 ***
## M.F          1.732e-01  3.375e-02   5.130 5.10e-07 ***
## eTIV        -3.397e-04  9.464e-05  -3.589 0.000385 ***
## EDUC        -1.219e-02  4.870e-03  -2.504 0.012793 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2394 on 312 degrees of freedom
## Multiple R-squared:  0.7651, Adjusted R-squared:  0.7621 
## F-statistic:   254 on 4 and 312 DF,  p-value: < 2.2e-16
# y_Group ~ CDR + M.F + eTIV + EDUC (Features selected)
# Refit the backward-selected model.
# FIX: pass data = X explicitly instead of relying on the attach()ed copy
# of the columns (y_Group itself is resolved from the calling environment).
# The coefficients are identical to the pinned output below; only the
# printed Call line gains the data= argument on a re-run.
b_model <- lm(y_Group ~ M.F + Age + EDUC + CDR + nWBV + ASF, data = X)
summary(b_model)
## 
## Call:
## lm(formula = y_Group ~ M.F + Age + EDUC + CDR + nWBV + ASF)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.14908 -0.12500 -0.06085  0.19494  0.47964 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.546490   0.484185   1.129 0.259906    
## M.F          0.162088   0.033617   4.822 2.24e-06 ***
## Age         -0.003178   0.002085  -1.524 0.128482    
## EDUC        -0.013056   0.004843  -2.696 0.007405 ** 
## CDR          1.020255   0.039840  25.609  < 2e-16 ***
## nWBV        -0.813532   0.463650  -1.755 0.080311 .  
## ASF          0.448722   0.120551   3.722 0.000234 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2387 on 310 degrees of freedom
## Multiple R-squared:  0.7679, Adjusted R-squared:  0.7634 
## F-statistic: 170.9 on 6 and 310 DF,  p-value: < 2.2e-16
# Refit the forward-selected model, likewise with an explicit data argument
f_model <- lm(y_Group ~ CDR + M.F + eTIV + EDUC, data = X)
summary(f_model)
## 
## Call:
## lm(formula = y_Group ~ CDR + M.F + eTIV + EDUC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1731 -0.1229 -0.0542  0.2064  0.4758 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.257e-01  1.340e-01   5.417 1.21e-07 ***
## CDR          1.047e+00  3.657e-02  28.629  < 2e-16 ***
## M.F          1.732e-01  3.375e-02   5.130 5.10e-07 ***
## eTIV        -3.397e-04  9.464e-05  -3.589 0.000385 ***
## EDUC        -1.219e-02  4.870e-03  -2.504 0.012793 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2394 on 312 degrees of freedom
## Multiple R-squared:  0.7651, Adjusted R-squared:  0.7621 
## F-statistic:   254 on 4 and 312 DF,  p-value: < 2.2e-16
# NOTE(review): these two models are not nested (b_model has Age/nWBV/ASF,
# f_model has eTIV instead), so the anova() F-test below is not a valid
# nested-model comparison — interpret with caution / confirm intent.
anova(b_model, f_model)
## Analysis of Variance Table
## 
## Model 1: y_Group ~ M.F + Age + EDUC + CDR + nWBV + ASF
## Model 2: y_Group ~ CDR + M.F + eTIV + EDUC
##   Res.Df    RSS Df Sum of Sq     F Pr(>F)
## 1    310 17.670                          
## 2    312 17.881 -2  -0.21112 1.852 0.1587
# ========================================================================
# Convert Group variable to factor so caret::train treats the task as
# classification (levels "0" = Nondemented, "1" = Demented)
Alzheimer$Group <- as.factor(Alzheimer$Group)

# Cross Validation (CV)
# For 5-fold CV
trControl <- trainControl(method = "cv", number = 5)

#lda
# Linear discriminant analysis on the four forward-selected features,
# tuned/validated with the 5-fold CV setup above
lda.fit <- train(Group ~ CDR + M.F + eTIV + EDUC,
                 method = "lda",
                 trControl = trControl,
                 metric = "Accuracy",
                 data = Alzheimer)

# NOTE(review): the predictions below are made on the full training data
# (resubstitution), so this confusion-matrix accuracy is optimistic; the
# cross-validated accuracy is in lda.fit$results.
lda.pred <- predict(lda.fit,Alzheimer)
t1 <- table(lda.pred, Alzheimer$Group)
confusionMatrix(t1)
## Confusion Matrix and Statistics
## 
##         
## lda.pred   0   1
##        0 188   0
##        1   2 127
##                                           
##                Accuracy : 0.9937          
##                  95% CI : (0.9774, 0.9992)
##     No Information Rate : 0.5994          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9869          
##                                           
##  Mcnemar's Test P-Value : 0.4795          
##                                           
##             Sensitivity : 0.9895          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9845          
##              Prevalence : 0.5994          
##          Detection Rate : 0.5931          
##    Detection Prevalence : 0.5931          
##       Balanced Accuracy : 0.9947          
##                                           
##        'Positive' Class : 0               
## 
# ========================================================================
#glm
# Generalized linear model classifier (logistic for a two-level factor
# response — presumably; confirm against caret's "glm" method docs) with
# the same 5-fold CV setup
glm.fit <- train(Group ~ CDR + M.F + eTIV + EDUC,
                 method = "glm",
                 trControl = trControl,
                 metric = "Accuracy",
                 data = Alzheimer)
# NOTE(review): the convergence / fitted-probabilities warnings below are
# symptomatic of (quasi-)complete separation — looks like CDR separates
# the classes almost perfectly; verify before trusting the coefficients.
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Resubstitution confusion matrix on the training data (optimistic),
# matching the LDA evaluation above
glm.pred <- predict(glm.fit,Alzheimer)
t2 <- table(glm.pred, Alzheimer$Group)
confusionMatrix(t2)
## Confusion Matrix and Statistics
## 
##         
## glm.pred   0   1
##        0 188   0
##        1   2 127
##                                           
##                Accuracy : 0.9937          
##                  95% CI : (0.9774, 0.9992)
##     No Information Rate : 0.5994          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9869          
##                                           
##  Mcnemar's Test P-Value : 0.4795          
##                                           
##             Sensitivity : 0.9895          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9845          
##              Prevalence : 0.5994          
##          Detection Rate : 0.5931          
##    Detection Prevalence : 0.5931          
##       Balanced Accuracy : 0.9947          
##                                           
##        'Positive' Class : 0               
## 
# ========================================================================
#knn
# k-nearest neighbours, tuning k over 1..10 by 5-fold CV accuracy
knn.fit <- train(Group ~ CDR + M.F + eTIV + EDUC,
                 method = "knn",
                 tuneGrid = expand.grid(k = 1:10),
                 trControl = trControl,
                 metric = "Accuracy",
                 data = Alzheimer)

# NOTE(review): the 100% accuracy below is measured on the training data
# itself; with a small selected k, kNN can effectively memorize the
# training set, so this resubstitution estimate is optimistic — compare
# with the cross-validated results in knn.fit$results.
knn.pred <- predict(knn.fit,Alzheimer)
t4 <- table(knn.pred, Alzheimer$Group)
confusionMatrix(t4)
## Confusion Matrix and Statistics
## 
##         
## knn.pred   0   1
##        0 190   0
##        1   0 127
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9884, 1)
##     No Information Rate : 0.5994     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.5994     
##          Detection Rate : 0.5994     
##    Detection Prevalence : 0.5994     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 0          
##